Note: This code is quite memory intensive because the pre-trained Google News word2vec model is loaded into memory; expect roughly 9 GB of RAM to be used.


In [33]:
import re
from gensim import models
from scipy import spatial
import numpy as np
import os.path
import urllib
import gzip
import json
import pandas as pd

In [18]:
def search_tags(entity, search):
    """
    This function searches through all the 'tags' (semantic content) of a catalog entry
    and returns True if the search expression is found. Case insensitive.
    """
    all_tags = '; '.join([str(x) for x in entity['tags'].values()])
    return bool(re.search(search, all_tags, flags=re.IGNORECASE))
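
As an illustration of the expected input, a catalog entity is assumed to be a dict containing a 'tags' dict (this is a hypothetical example, not an entry read from the catalogs loaded below):

# hypothetical entity structure, matching what search_tags expects
exampleEntity = {'tags': {'Name': 'Roundwood, softwood, average, at forest road, NE-NC'}}
print search_tags(exampleEntity, 'softwood')   # True
print search_tags(exampleEntity, 'hardwood')   # False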

In [14]:
def gunzipFile(inFileName, outFileName):
    # decompress a gzip file to the given output path
    with gzip.open(inFileName, 'rb') as inF, open(outFileName, 'wb') as outF:
        outF.write(inF.read())

In [3]:
# the idea for this code comes from this blog post:
# http://sujitpal.blogspot.nl/2015/09/sentence-similarity-using-word2vec-and.html
def sentenceDistance(sent1, sent2, stoplist):
    # remove all non-alphanumeric characters
    sent1 = re.sub('[^0-9a-zA-Z]+', ' ', sent1)
    sent2 = re.sub('[^0-9a-zA-Z]+', ' ', sent2)
    # split the sentences into tokens, convert to lower case, and remove stopwords
    tokens1 = [word for word in sent1.lower().split() if word not in stoplist]
    tokens2 = [word for word in sent2.lower().split() if word not in stoplist]
    
    # keep only unique tokens
    tokens1 = list(set(tokens1))
    tokens2 = list(set(tokens2))
    
    # For every word in sent1, find the shortest distance to any word in sent2.
    # Matching words give a distance of 0; synonyms should give a small distance.
    # The sum of these shortest distances over all words in sent1 is returned as totalDist.
    # 9999 is used as a sentinel meaning "no comparable word pair was found".
    totalDist = 9999
    for token1 in tokens1:
        if token1 in model.vocab:
            minDist = 9999
            for token2 in tokens2:
                if token2 in model.vocab:
                    lv = model[token1]
                    rv = model[token2]
                    dist = spatial.distance.cosine(lv, rv)
                    # instead of cosine distance, euclidean distance can also be tried:
                    #dist = spatial.distance.euclidean(lv, rv)
                    if dist < minDist:
                        minDist = dist
            if minDist < 9999:
                if totalDist == 9999:
                    totalDist = minDist
                else:
                    totalDist = totalDist + minDist
    return totalDist
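
As a rough, hypothetical sanity check (it can only be run after the word2vec model and stoplist are loaded in the cells further below), related phrases should yield a smaller total distance than unrelated ones:

# hypothetical sanity check -- requires 'model' and 'stoplist' from the cells below
print sentenceDistance('roundwood softwood at forest road', 'timber spruce', stoplist)
print sentenceDistance('roundwood softwood at forest road', 'waste incineration', stoplist)
# the first (related) pair is expected to give a smaller total distance than the second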

Load in the stopwords file. These are common words which we wish to exclude when performing comparisons (a, an, the, etc.). The file contains one word per line.


In [4]:
stopWordsFile = "en.txt"
with open(stopWordsFile) as f:
    stoplist = [x.strip('\n') for x in f.readlines()]

We need to check if we have the word2vec model which has been pre-trained on the Google News corpus. The vectors have 300 dimensions and were trained on a corpus of about 100 billion words.

Note: This file is 1.6 GB compressed and expands to 3.4 GB


In [10]:
if os.path.isfile("GoogleNews-vectors-negative300.bin.gz") == False:
    # This is the direct download link for GoogleNews-vectors-negative300.bin.gz
    # If the link changes, just search for the filename as this is a file often used for word2vec
    downloadURL = 'https://doc-0g-8s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/dhu4deogg9hg0tkm9tdann504ue0vp91/1461232800000/06848720943842814915/*/0B7XkCwpI5KDYNlNUTTlSS21pQmM?e=download'
    urllib.urlretrieve (downloadURL, "GoogleNews-vectors-negative300.bin.gz")

Unzip the file. This may take several minutes due to the Python gzip library. It may be quicker to do this from the command line or via a system call; a possible shortcut is sketched after the next cell.


In [11]:
if os.path.isfile("GoogleNews-vectors-negative300.bin") == False:
    gunzipFile('GoogleNews-vectors-negative300.bin.gz', 'GoogleNews-vectors-negative300.bin')
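
One possible system-call alternative, assuming the gunzip utility is available on the system (this is an illustrative sketch, not part of the original workflow):

# alternative: shell out to gunzip instead of using the Python gzip module
# (assumes 'gunzip' is on the PATH; -c writes to stdout so the .gz file is kept)
import os
if os.path.isfile("GoogleNews-vectors-negative300.bin") == False:
    os.system('gunzip -c GoogleNews-vectors-negative300.bin.gz > GoogleNews-vectors-negative300.bin')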

Create a model using this pre-trained data set. Loading the vectors is the step that accounts for most of the memory usage noted at the top.


In [12]:
model = models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
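
As a quick, hypothetical sanity check that the vectors loaded correctly, related words should score a higher cosine similarity than unrelated ones (the example words are assumptions, not taken from the catalogs):

# hypothetical sanity check of the loaded vectors
print model.similarity('timber', 'lumber')       # related words, expect a larger value
print model.similarity('timber', 'electricity')  # less related words, expect a smaller value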

Load in the data from the two catalogs (GaBi 2016 and USLCI).


In [34]:
# http://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-ones-from-json-in-python
# need this to deal with unicode errors
def byteify(input):
    if isinstance(input, dict):
        return {byteify(key): byteify(value)
                for key, value in input.iteritems()}
    elif isinstance(input, list):
        return [byteify(element) for element in input]
    elif isinstance(input, unicode):
        return input.encode('utf-8')
    else:
        return input

gunzipFile('../catalogs/gabi_2016_professional-database-2016.json.gz', 
           '../catalogs/gabi_2016_professional-database-2016.json')
gunzipFile('../catalogs/uslci_ecospold.json.gz', 
           '../catalogs/uslci_ecospold.json')

with open('../catalogs/gabi_2016_professional-database-2016.json') as data_file:    
    gabi = json.load(data_file, encoding='utf-8')

with open('../catalogs/uslci_ecospold.json') as data_file:    
    uslci = json.load(data_file, encoding='utf-8')
    
gabi = byteify(gabi)
uslci = byteify(uslci)
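
To get a feel for the structure of the loaded catalogs, we can peek at the flow lists (field names are the ones used in the matching code below, assuming each flow carries a 'Name' tag):

# USLCI flows sit under 'flows'; GaBi flows under 'archives'[0]['flows']
print len(uslci['flows']), len(gabi['archives'][0]['flows'])
print uslci['flows'][0]['tags']['Name']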

In [78]:
roundwood = [flow for flow in uslci['flows'] if search_tags(flow,'roundwood, softwood')]
roundwoodExample = roundwood[0]

# number of top scores to show
numTopScores = 10

flowNames = []
distValues = []
for flow in gabi['archives'][0]['flows']:
    name = flow['tags']['Name']
    flowNames.append(name)
    dist = sentenceDistance(roundwoodExample['tags']['Name'], name, stoplist)
    distValues.append(dist)

# flowNames and distValues now hold one entry per GaBi flow
    
# figure out top scores
arr = np.array(distValues)
topIndices = arr.argsort()[0:numTopScores]
topScores = np.array(distValues)[topIndices]

print 'Process name to match:'
print roundwoodExample['tags']['Name']

print 'Matches using Word2Vec:'
for i, s in zip(topIndices, topScores):
    if s < 9999:
        print(flowNames[i],s)


Process name to match:
Roundwood, softwood, average, at forest road, NE-NC
Matches using Word2Vec:
('Timber cedar (12% moisture; 10.7% H2O content) (m3)', 4.3803496516921951)
('Timber spruce (12% moisture; 10.7% H2O content)', 4.3803496516921951)
('Timber (12% moisture; 10.7% H2O content)', 4.3803496516921951)
('Timber pine (65% moisture; 40% H2O content)', 4.5251883015105836)
('Timber spruce (65% moisture; 40% H2O content)', 4.5282608796782853)
('Road (average)', 4.5344290329591512)
('Laminated veneer lumber (LVL)', 4.6418420007844299)
('Wood pellets (5.8% H2O content)', 4.6981787422455534)
('Waste incineration of untreated wood (10.7% H2O content)', 4.7205080571528928)
('Solid construction timber (15% moisture)', 4.7382955596134773)